In [1]:
import words
reload(words)


Out[1]:
<module 'words' from 'words.pyc'>

In [67]:
# Forwarded to every samples.load_samples call below as its `cache` argument.
CACHE = False

import samples
# Re-import so edits to the samples module take effect without restarting the kernel.
reload(samples)
# `data` is the labelled set handed to Model(data) further down.
data = samples.load_samples(["Keywords", "UK", "Georgia", "Mexico", "EU"], cache=CACHE)
keywords = samples.load_samples(["Keywords"], cache=CACHE)
# Loaded on their own; `canada` is later scored with Model.test_data.
canada = samples.load_samples(["Canada"], cache=CACHE)
moldova = samples.load_samples(["Moldova"], cache=CACHE)
unops = samples.load_samples(["UNOPS"], cache=CACHE)
# Distinct 'entity' labels found in the Keywords sample (set order is arbitrary).
entities = list( set(x['entity'] for x in keywords) )

entities


Out[67]:
['notice',
 'good',
 'solicitation',
 'contract',
 'supplier',
 'authority',
 'buyer',
 '?']

In [3]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn import cross_validation

In [58]:
# Map each value of row['sample'] to the positions in `data` of the rows carrying it.
slices = {}
for i, row in enumerate(data):
    slices.setdefault(row['sample'], []).append(i)

I. Define Raw Data


In [5]:
def organize_data(data):
    """Flatten a {entity: [header, ...]} mapping into a list of row dicts.

    Each row has the keys 'entity' and 'header'. Rows follow the iteration
    order of ``data.items()`` and, within an entity, the order of its headers.
    """
    return [
        {'entity': entity, 'header': header}
        for entity, entity_headers in data.items()
        for header in entity_headers
    ]

II. Define Features


In [11]:
def length(df):
    """Feature: ``len()`` of every value in the 'header' column."""
    header_column = df['header']
    return header_column.apply(len)

def word_count(df):
    """Feature: how many items ``words.split_words`` yields for each header."""
    def count_tokens(header):
        tokens = list(words.split_words(header))
        return len(tokens)

    return df['header'].apply(count_tokens)

def header_in_entity(df):
    """Feature: is the lower-cased header equal to any lower-cased 'entity' value?

    Membership is tested against the whole 'entity' column, not against the
    entity of the same row.
    NOTE(review): this reads the label column. Model.predict fills that column
    with '?', so there the feature can only be true for a header of '?' --
    confirm that is intended.
    """
    lowered_headers = df['header'].str.lower()
    lowered_entities = df['entity'].str.lower()
    return lowered_headers.isin(lowered_entities)

def entity_in_header(df):
    """Feature: is the lower-cased entity equal to any lower-cased 'header' value?

    Membership is tested against the whole 'header' column, not against the
    header of the same row.
    NOTE(review): this is computed from the label column. Model.predict fills
    that column with '?', so there the feature is only true when some header
    is '?' -- confirm that is intended.
    """
    lowered_entities = df['entity'].str.lower()
    lowered_headers = df['header'].str.lower()
    return lowered_entities.isin(lowered_headers)

def entity_feature(name):
    """Build a feature function that scores headers against one entity.

    The headers of every row in the module-level ``data`` whose 'entity'
    equals ``name`` are collected once, when this factory is called. The
    returned function takes a DataFrame and returns, for each value in its
    'header' column, ``words.subsetness(header, entity_set)``.

    The returned function is named ``'entity_<name>'`` so that
    ``Model.build`` gives every entity its own column.
    """
    entity_set = set(x['header'] for x in data if x['entity'] == name)

    def score(header):
        # Best effort: a header that subsetness cannot handle scores 0.
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop a long-running cell.
        try:
            return words.subsetness(header, entity_set)
        except Exception:
            return 0

    def feature(df):
        return df['header'].apply(score)

    # On Python 2 `func_name` is an alias of `__name__`, so code reading
    # either attribute sees the new name; `__name__` also works on Python 3.
    feature.__name__ = 'entity_%s' % name
    return feature

# One feature function per label in `entities`, in the same order.
entity_features = [entity_feature(name) for name in entities]

In [11]:

III. Combine Features into Feature Matrix & Define Outcome

IV. Create Model

V. Split Data into Test and Training

VI. Fit and Test Models


In [85]:
from sklearn.ensemble import RandomForestClassifier

class Model(object):
    """Fit an estimator on labelled header samples and report its accuracy.

    ``samples`` is a list of dicts; it is turned into ``self.frame`` with one
    row per dict. A *feature* is a callable that takes a DataFrame and returns
    one value per row; ``build`` stores each result in a column named after
    the callable.

    Every evaluation method re-fits ``self.svm`` from scratch and prints its
    result instead of returning it.
    """

    def __init__(self, samples, outcome_key='entity', svm=None):
        """
        samples     -- list of row dicts; row i of ``self.frame`` is samples[i].
        outcome_key -- name of the column holding the label to predict.
        svm         -- estimator exposing ``fit(X, y)`` and ``predict(X)``.
                       When omitted, a RandomForestClassifier with
                       n_estimators=10 is created for this instance.
        """
        # Build the default here, not in the signature: a default written in
        # the signature is evaluated once and the same estimator object is
        # then shared (and re-fitted) by every Model relying on it.
        if svm is None:
            svm = RandomForestClassifier(n_estimators=10)
        self.samples = samples
        self.svm = svm
        self.frame = pd.DataFrame(self.samples)
        self.outcome_key = outcome_key
        # Not read or updated by any method of this class.
        self.features_built = set()

    def test(self, features, iterations=5, train_size=0.35, test_size=.25, seed=0):
        """Print the mean accuracy over ``iterations`` random train/test splits.

        ``seed`` is passed only to the splitter; the estimator is not seeded
        here.
        """
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]

        rs = cross_validation.ShuffleSplit(len(X), n_iter=iterations, train_size=train_size, test_size=test_size, random_state=seed)

        accuracies = []
        for train_index, test_index in rs:
            # The splitter yields row positions, so index positionally.
            estimator = self.svm.fit(X.iloc[train_index], y.iloc[train_index])
            actual = y.iloc[test_index].values
            predicted = estimator.predict(X.iloc[test_index])
            accuracies.append(self.score_model(actual, predicted))

        print("Avg Accuracy: %%%.2f" % np.mean(accuracies))

    def test_sample(self, slice, features, holdout=False):
        """Print header / actual / predicted for the rows at positions ``slice``.

        slice   -- iterable of row positions into ``self.samples``.
        holdout -- when False (the default, and the original behaviour) the
                   estimator is fitted on *all* rows, including the ones being
                   scored, so the printed accuracy is measured on training
                   data. Pass True to fit only on the rows outside ``slice``.
        """
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]

        positions = list(slice)
        if holdout:
            train_positions = np.setdiff1d(np.arange(len(X)), positions)
            estimator = self.svm.fit(X.iloc[train_positions], y.iloc[train_positions])
        else:
            estimator = self.svm.fit(X, y)

        actual = y.iloc[positions].values
        predicted = estimator.predict(X.iloc[positions])
        accuracy = self.score_model(actual, predicted)

        for i, a, p in zip(positions, actual, predicted):
            print("%s %s %s" % (self.samples[i]['header'].ljust(50), a.ljust(20), p))

        print("Accuracy: %%%.2f" % accuracy)

    def test_data(self, data, features):
        """Fit on this model's own samples, then score the separate rows in ``data``.

        ``data`` is a list of row dicts carrying 'header' and the outcome
        column. Prints header / actual / predicted per row, then the accuracy.
        """
        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]

        estimator = self.svm.fit(X, y)

        df = pd.DataFrame(data)
        z = self.build(df, features)

        # Read the label through outcome_key, as the training side does,
        # instead of assuming the column is called 'entity'.
        actual = df[self.outcome_key].values
        predicted = estimator.predict(z)
        accuracy = self.score_model(actual, predicted)

        for dct, a, p in zip(data, actual, predicted):
            print("%s %s %s" % (dct['header'].ljust(50), a.ljust(20), p))

        print("Accuracy: %%%.2f" % accuracy)

    def score_model(self, actual, predicted):
        """Return the percentage (0-100) of positions where the two sequences agree.

        Values are compared by position; any pandas index on the inputs is
        ignored. Returns NaN when there is nothing to score.

        Raises ValueError if the two inputs differ in length.
        """
        actual = np.asarray(actual)
        predicted = np.asarray(predicted)
        if actual.shape != predicted.shape:
            raise ValueError("actual and predicted differ in shape: %r vs %r"
                             % (actual.shape, predicted.shape))
        total = actual.size
        if total == 0:
            return float('nan')
        correct = int(np.sum(actual == predicted))
        return float(correct) / float(total) * 100

    def predict(self, headers, features):
        """Fit on all samples and return ``[(header, predicted_label), ...]``.

        The frame built for ``headers`` and its feature matrix are kept on
        ``self.df`` and ``self.z`` for inspection. The placeholder label '?'
        is stored under the 'entity' key.
        """
        # Materialise once: headers is walked twice below, which would
        # silently yield nothing the second time for a generator.
        headers = list(headers)

        X = self.build(self.frame, features)
        y = self.frame[self.outcome_key]
        estimator = self.svm.fit(X, y)

        data = [{'header': h, 'entity': '?'} for h in headers]
        df = pd.DataFrame(data)
        z = self.build(df, features)

        self.df = df
        self.z = z

        predictions = estimator.predict(z)
        return list(zip(headers, predictions))

    def build(self, df, features):
        """Return a DataFrame with one column per feature, evaluated on ``df``.

        Columns appear in the order of ``features``; two features with the
        same name overwrite each other.
        """
        result = pd.DataFrame(index=df.index)
        for fn in features:
            result[self._feature_name(fn)] = fn(df)
        return result

    @staticmethod
    def _feature_name(fn):
        """Column name for a feature: its ``func_name`` if set, else ``__name__``."""
        # Python 2 functions always expose func_name (an alias of __name__);
        # on Python 3 only features that were given one explicitly do.
        return getattr(fn, 'func_name', None) or fn.__name__

# Wrap `data` in a Model with the default outcome column and estimator.
model = Model(data)

In [13]:
# Shuffle-split accuracy using only header length and word count.
model.test(features=[length, word_count])


Avg Accuracy: %37.95

In [14]:
# Same evaluation with the two header/entity membership features added.
model.test(features=[length, word_count, header_in_entity, entity_in_header])


Avg Accuracy: %49.09

In [86]:
# All four generic features plus one feature per entity label.
model.test(features=[length, word_count, header_in_entity, entity_in_header] + entity_features)


Avg Accuracy: %79.77

In [16]:
# The per-entity features on their own.
model.test(features=entity_features)


Avg Accuracy: %78.64

VII. Predict a Dataset


In [65]:
# Score the rows of one sample; Model.test_sample fits on every row of the
# model first, so these rows are also part of the training data.
# NOTE(review): the `data` cell in this file does not list "Canada" and
# `slices` is built from `data`; if load_samples tags rows with the requested
# sample name, this lookup raises KeyError on a fresh run -- confirm.
model.test_sample(slices['Canada'], features= [length, word_count, header_in_entity, entity_in_header] + entity_features)


language                                           ?                    ?
procurement_entity_name                            authority            authority
title                                              good                 good
reference_number                                   notice               notice
solicitation_number                                solicitation         solicitation
contract_sequence_number                           contract             contract
contract                                           contract             notice
publishing_status                                  solicitation         solicitation
award_date                                         contract             contract
publication_date                                   notice               notice
amendment_date                                     notice               notice
gsin                                               good                 good
contract_award_procedure                           solicitation         solicitation
tendering_procedure                                solicitation         solicitation
procurement_entity                                 authority            authority
end_user_entity                                    buyer                buyer
customer_info                                      buyer                buyer
description                                        good                 good
supplier_info                                      supplier             buyer
currency                                           solicitation         solicitation
currency                                           notice               contract
currency                                           contract             contract
contract_value                                     solicitation         solicitation
contract_value                                     notice               contract
contract_value                                     contract             contract
Accuracy: %84.00

In [87]:
# Fit on the model's own samples and score the separately loaded `canada` rows.
model.test_data(canada, features= [length, word_count, header_in_entity, entity_in_header] + entity_features)


language                                           ?                    ?
procurement_entity_name                            authority            buyer
title                                              good                 good
reference_number                                   notice               ?
solicitation_number                                solicitation         ?
contract_sequence_number                           contract             authority
contract_number                                    contract             solicitation
publishing_status                                  solicitation         solicitation
award_date                                         contract             solicitation
publication_date                                   notice               solicitation
amendment_date                                     notice               solicitation
gsin                                               good                 good
contract_award_procedure                           solicitation         ?
tendering_procedure                                solicitation         solicitation
procurement_entity                                 authority            ?
end_user_entity                                    buyer                solicitation
customer_info                                      buyer                solicitation
description                                        good                 good
supplier_info                                      supplier             solicitation
currency                                           solicitation         solicitation
currency                                           notice               solicitation
currency                                           contract             solicitation
contract_value                                     solicitation         solicitation
contract_value                                     notice               solicitation
contract_value                                     contract             solicitation
Accuracy: %32.00

In [37]:
# Show the last predict() input next to its feature matrix. `model.df` and
# `model.z` are only set inside Model.predict, so a predict call must have run first.
model.df.join(model.z)


Out[37]:
entity header length word_count header_in_entity entity_in_header entity_notice entity_good entity_solicitation entity_contract entity_supplier entity_authority entity_buyer entity_?
0 ? language 8 6 False False 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
1 ? procurement_entity_name 23 17 False False 0.882353 0.882353 0.882353 0.882353 0.882353 0.882353 0.941176 0.882353
2 ? title 5 12 False False 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 1.000000
3 ? reference_number 16 28 False False 0.035714 0.607143 1.000000 0.071429 0.000000 0.000000 0.000000 1.000000
4 ? solicitation_number 19 20 False False 0.050000 0.850000 0.850000 0.050000 0.000000 0.000000 0.000000 0.900000
5 ? contract_sequence_number 24 36 False False 0.361111 0.805556 0.805556 0.361111 0.000000 0.000000 0.000000 0.805556
6 ? contract_number 15 29 False False 0.448276 1.000000 1.000000 0.448276 0.000000 0.000000 0.000000 1.000000
7 ? publishing_status 17 6 False False 0.333333 0.000000 1.000000 0.500000 0.000000 0.000000 0.000000 0.000000
8 ? award_date 10 18 False False 1.000000 0.277778 1.000000 1.000000 0.277778 0.277778 0.000000 0.277778
9 ? publication_date 16 17 False False 0.764706 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000
10 ? amendment_date 14 15 False False 0.866667 0.000000 0.866667 0.866667 0.000000 0.000000 0.000000 0.000000
11 ? gsin 4 16 False False 0.000000 1.000000 0.437500 0.000000 0.000000 0.000000 0.000000 0.000000
12 ? contract_award_procedure 24 21 False False 0.809524 0.809524 1.000000 0.809524 0.238095 0.285714 0.000000 1.000000
13 ? tendering_procedure 19 8 False False 0.500000 0.000000 1.000000 0.500000 0.125000 0.125000 0.000000 0.625000
14 ? procurement_entity 18 2 False False 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.500000 0.000000
15 ? end_user_entity 15 22 False False 0.000000 0.000000 0.818182 0.818182 0.000000 0.000000 0.045455 0.000000
16 ? customer_info 13 2 False False 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.500000
17 ? description 11 3 False False 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
18 ? supplier_info 13 2 False False 0.000000 0.000000 0.000000 0.000000 0.500000 0.000000 0.000000 0.500000
19 ? currency 8 3 False False 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
20 ? currency 8 3 False False 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
21 ? currency 8 3 False False 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
22 ? contract_value 14 23 False False 1.000000 0.521739 1.000000 1.000000 0.000000 0.000000 0.000000 0.521739
23 ? contract_value 14 23 False False 1.000000 0.521739 1.000000 1.000000 0.000000 0.000000 0.000000 0.521739
24 ? contract_value 14 23 False False 1.000000 0.521739 1.000000 1.000000 0.000000 0.000000 0.000000 0.521739

25 rows × 14 columns


In [19]:
# Train on the Keywords sample only, then label a list of unseen headers.
model = Model(keywords)

# `headers` was not defined by any cell, so this cell only ran on leftover
# kernel state. The rows printed below it are the headers of the Canada
# sample, in order, so take them from `canada`.
headers = [row['header'] for row in canada]

results = model.predict(headers, features = [length, word_count, header_in_entity, entity_in_header] + entity_features)

for header, result in results:
    # Single-argument print form: same output on Python 2, valid on Python 3.
    print("%s %s" % (header.ljust(50), result))


language                                           supplier
procurement_entity_name                            ?
title                                              supplier
reference_number                                   solicitation
solicitation_number                                solicitation
contract_sequence_number                           solicitation
contract_number                                    solicitation
publishing_status                                  solicitation
award_date                                         solicitation
publication_date                                   ?
amendment_date                                     ?
gsin                                               good
contract_award_procedure                           ?
tendering_procedure                                solicitation
procurement_entity                                 solicitation
end_user_entity                                    solicitation
customer_info                                      solicitation
description                                        buyer
supplier_info                                      solicitation
currency                                           buyer
currency                                           buyer
currency                                           buyer
contract_value                                     ?
contract_value                                     ?
contract_value                                     ?

In [ ]:
# List the header of every row in the UNOPS sample.
[obj['header'] for obj in unops]